require(plyr)
require(dplyr)
require(tidyr)
require(foreign)
require(data.table)

#### IMPORT BASE DATA ####

# Set working directory (placeholder path; the relative paths used below are
# resolved against it)
setwd("...")

# Read the entry-level data with model predictions
dd <- fread("./data/predictedData.csv")

# Create or redefine some key measures:
#   ccode     - coerced to integer; rows without a usable code are dropped
#   ccodeyear - "<ccode> <year>" key used later for the country-year merges
#   date      - converted to Date
#   admin     - factor with the four administrations as levels, in this order
dd <- dd %>%
  dplyr::mutate(ccode = as.integer(ccode)) %>%
  drop_na(ccode) %>%
  dplyr::mutate(
    ccodeyear = paste(ccode, year, sep = " "),
    date = as.Date(date),
    admin = factor(admin, levels = c("Kennedy", "Johnson", "Nixon", "Ford"))
  )

# Attach country/region data; merge() keeps only rows whose ccode appears in
# both tables
careg <- fread("./data/regionData.csv")
dd <- merge(dd, careg, by = "ccode")

# Drop the United States (ccode 2), since the PDB doesn't really report on the
# US, and drop entries that are far too short (fewer than 10 words)
dd <- dd %>% filter(ccode != 2, nWords >= 10)


#### PRODUCE COUNTRY-YEAR MEASURES ####

#### Calculate years since independence ####

# Import ICOW data
icow <- read.csv("./outside_data/icowcol.csv")

# Year of independence = first four characters of the Indep field
icow$indyear <- substr(icow$Indep, 1, 4)

# Vector of relevant years
yrs <- 1961:1977

# Unique vector of countries
ccs <- unique(icow$State)

# For each country, build one row per year in `yrs` with:
#   ccyear - "<State> <year>" merge key
#   ysi    - years since independence; 0 before independence and wherever
#            decol is 0
#   decol  - 1 when the country's Type is 2, independence came in 1918 or
#            later, and the year is not before independence; otherwise 0
ysiList <- vector("list", length(ccs))
for (i in seq_along(ccs)) {

  # Use the last ICOW row listed for this country
  onestate <- icow %>% filter(State == ccs[i]) %>% slice_tail()

  # Get country's year of independence
  oneind <- as.numeric(onestate$indyear)

  # Locate pre-independence years from the raw difference. This must be done
  # before negative values are clamped to 0; testing `ysi < 0` after the clamp
  # can never match and would leave decol = 1 in years before independence.
  elapsed <- yrs - oneind
  preind <- which(elapsed < 0)

  ysi <- elapsed
  ysi[preind] <- 0

  decol <- rep(ifelse(onestate$Type == 2, 1, 0), length(yrs))
  decol[preind] <- 0

  # Discard very early decolonization (independence before 1918)
  if (oneind < 1918) {
    decol[] <- 0
  }

  # Years since independence only count for decolonized country-years
  ysi[which(decol == 0)] <- 0

  # Save results to list
  ysiList[[i]] <- data.frame(ccyear = paste(ccs[i], yrs, sep = " "),
                             ysi, decol)
}

# Combine all results into data frame
ysiData <- do.call("rbind", ysiList)

# Write out data frame, to be used in analyzeTropes.R
fwrite(ysiData, "./data/ysiData.csv")


#### Import other datasets ####

# Each table below is reduced to a "<ccode> <year>" key (ccyear) plus the
# measures that are merged into the base data later on. select()/filter() are
# namespace-qualified throughout so another attached package cannot mask them.

# Import autocratic regime data to get personalism measure
aow <- read.csv("./outside_data/AoW_v1.csv")
aow$ccode[which(aow$ccode == 364)] <- 365 # Fix Soviet Union coding
aow$ccode[which(aow$ccode == 769)] <- 770 # Fix Pakistan coding
aow$ccyear <- paste(aow$ccode, aow$year, sep = " ")
# Keep only the key and the personalism measure; missing personalism -> 0
aow <- aow %>% dplyr::select(ccyear, personal)
aow$personal[which(is.na(aow$personal))] <- 0

# Import Polity data
pol <- readxl::read_xlsx("./outside_data/p5v2018.xlsx")
pol$ccyear <- paste(pol$ccode, pol$year, sep = " ")
# democracy = 1 when polity2 >= 6; a missing polity2 is coded 0
pol$democracy <- ifelse(pol$polity2 >= 6, 1, 0)
pol$democracy[which(is.na(pol$democracy))] <- 0
pol <- pol %>% dplyr::select(ccyear, democracy)

# Import diplomatic visit data, restricted to 1961-1977
visit <- foreign::read.dta("./outside_data/diplomatic_core.replication.dta")
visit <- visit %>% dplyr::filter(year > 1960 & year < 1978)
visit$ccyear <- paste(visit$cowid, visit$year, sep = " ")
# Missing USdefense is coded 0 (USmilaid and UStrade are left as they are)
visit$USdefense[which(is.na(visit$USdefense))] <- 0
visit <- visit %>% dplyr::select(ccyear, USmilaid, USdefense, UStrade)


#### Determine occurrence of conflict ####

# Import conflict data and parse the date columns
prio <- read.csv("./outside_data/ucdp-prio-acd-201.csv")
prio$start_date <- as.Date(prio$start_date, "%Y-%m-%d")
prio$start_date2 <- as.Date(prio$start_date2, "%Y-%m-%d")
prio$ep_end_date <- as.Date(prio$ep_end_date, "%Y-%m-%d")

# Keep the rows where a conflict episode ended, restricted to episodes that
# start on or before 1/21/77 AND end on or after 6/13/61 (both must hold)
pr <- prio %>%
  dplyr::filter(ep_end == 1) %>%
  dplyr::filter(start_date2 <= as.Date("1977-01-21"),
                ep_end_date >= as.Date("1961-06-13"))

# Make participant-level data: one row per (episode, participant code).
# NOTE(review): per the UCDP/PRIO ACD codebook the gwno_* fields may hold
# several comma-separated codes (e.g. lists of secondary parties); such a
# string can never equal a single ccode, so each code is split onto its own
# row. This is a no-op if every field holds exactly one code - confirm
# against the data.
pr <- pr %>%
  pivot_longer(cols = c("gwno_a", "gwno_a_2nd", "gwno_b", "gwno_b_2nd"),
               names_to = "side", values_to = "side_id",
               values_transform = list(side_id = as.character)) %>%
  separate_rows(side_id, sep = ",") %>%
  dplyr::mutate(side_id = trimws(side_id)) %>%
  dplyr::filter(side_id != "")

# Only intense conflicts (intensity_level == 2) are counted below
intense <- pr[which(pr$intensity_level == 2), ]

# Hoist the per-entry lookups out of the loop
entry_dates <- as.Date(dd$date)
entry_codes <- dd$ccode

# For each row of our base data, count the intense participant rows whose
# episode spans the entry's date and whose participant code equals the entry's
# country code. seq_len() keeps this correct when dd has zero rows.
conf_high <- vapply(seq_len(nrow(dd)), function(i) {
  active <- intense$start_date2 <= entry_dates[i] &
    intense$ep_end_date >= entry_dates[i] &
    intense$side_id %in% entry_codes[i]

  # Progress indicator
  if (i %% 1000 == 0) {
    message(i)
  }

  # which() drops NA comparisons, so they count as "no conflict"
  length(which(active))
}, integer(1))

# Combine results: one row per row of dd, in the same order
pconf <- data.table::data.table(conf_high = conf_high)


#### MERGE ALL DATA AND EXPORT ####

# Base data with the conflict counts attached column-wise
ddb <- cbind(dd, pconf)

# Years since independence: plain merge(), so only entries with a matching
# country-year key are kept
ddb <- merge(ddb, ysiData, by.x = "ccodeyear", by.y = "ccyear")

# Personalism, Polity/democracy and diplomatic visit data, in that order:
# all.x keeps every entry even when a table has no matching country-year
ddb <- Reduce(
  function(acc, tbl) {
    merge(acc, tbl, by.x = "ccodeyear", by.y = "ccyear", all.x = TRUE)
  },
  list(aow, pol, visit),
  ddb
)

# Entries left without a personalism or democracy value are coded 0
ddb$personal[is.na(ddb$personal)] <- 0
ddb$democracy[is.na(ddb$democracy)] <- 0

# Logged versions of years since independence and of the entry's word count
ddb$logYSI <- log(ddb$ysi + 1)
ddb$logWords <- log(ddb$nWords + 1)

# Extract key variables and write this out to CSV file
dds <- dplyr::select(
  ddb,
  pdbID:entryNum, title, text, tt,
  globalsouth, logYSI, decol, geo_region, geo_region2, conf_high, democracy,
  personal, leaderMention, leaderTenure, UStrade, USmilaid, USdefense,
  nWords, logWords,
  country, ccode,
  starts_with("pred"), starts_with("count"), starts_with("Topic")
)
fwrite(dds, "./data/tropeData.csv")